View paper on journal site View paper on Altmetric

Abstract

Genetic clustering algorithms, implemented in programs such as STRUCTURE and ADMIXTURE, have been used extensively in the characterisation of individuals and populations based on genetic data. A successful example is the reconstruction of the genetic history of African Americans as a product of recent admixture between highly differentiated populations. Histories can also be reconstructed using the same procedure for groups that do not have admixture in their recent history, where recent genetic drift is strong or that deviate in other ways from the underlying inference model. Unfortunately, such histories can be misleading. We have implemented an approach, badMIXTURE, to assess the goodness of fit of the model using the ancestry “palettes” estimated by CHROMOPAINTER and apply it to both simulated data and real case studies. Combining these complementary analyses with additional methods that are designed to test specific hypotheses allows a richer and more robust analysis of recent demographic history.

Setup API keys

To use the Twitter API, sign up for a developer account and obtain the necessary API tokens. For security, these are stored in a separate _config.yaml file.

#-----------------------------------------------------------------------------
# API credentials
#-----------------------------------------------------------------------------
# Load API credentials from the (git-ignored) YAML config file.
keys <- yaml.load_file("_config.yaml")
## Warning in readLines(con): incomplete final line found on '_config.yaml'

# Credential list in the format expected by tweetscores functions.
my_oauth <- list(
  consumer_key = keys$consumer_key,
  consumer_secret = keys$consumer_secret,
  access_token = keys$access_token,
  access_token_secret = keys$access_secret
)

# OAuth token used by rtweet API calls.
token <- create_token(
  app = keys$app_name,
  consumer_key = keys$consumer_key,
  consumer_secret = keys$consumer_secret,
  access_token = keys$access_token,
  access_secret = keys$access_secret
)

# Legacy twitteR authentication, kept for reference:
# setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

Functions for topic modeling

This code defines functions that will be used in generating and formatting data for the LDA model.

#-----------------------------------------------------------------------------
# functions for topic modeling
#-----------------------------------------------------------------------------
# tmls <- get_timelines(plotdat$account[1:4], n = 3200) # testing with timelines instead of bios

# Look up Twitter profile metadata (bios, display names, etc.) for an
# account's followers via rtweet::lookup_users().
#
# followers: list or vector of follower user IDs, as stored in the
#   `followers` list-column scraped elsewhere in this script.
# max_followers: cap on the number of followers looked up per account
#   (default 10000, matching the previously hard-coded limit); bounds the
#   number of Twitter API calls for very large accounts.
#
# Returns: data frame of user metadata from lookup_users().
get_follower_bios <- function(followers, max_followers = 10000){
  ids <- unlist(followers)
  ids <- head(ids, max_followers)
  
  lookup_users(ids, token=token)
}

# Build the emoji lookup table. scrapeEmoticons.R defines `alltogether`,
# a data frame mapping native emoji characters to text descriptions.
source(paste0(datadir, "/scrapeEmoticons.R"))

# Alternative: source the script directly from
# https://github.com/today-is-a-good-day/emojis
# script <- getURL("https://raw.githubusercontent.com/today-is-a-good-day/emojis/master/scrapeEmoticons.R", ssl.verifypeer = FALSE)
# eval(parse(text = script))

lookup <- alltogether %>%
  mutate(
    # space-padded token so a textualized emoji stays a separate word
    Desc_Sub = tolower(paste0(" emoji", gsub("[+]|-| ", "", Description), " ")),
    # space-free variant for matching tokens back to native emoji
    Desc_Sub2 = gsub(" ", "", Desc_Sub)
  )

# String converters built on the global `lookup` table defined above.
# Each applies every row of the table as a fixed-string substitution, in order.
# Based on: https://stackoverflow.com/questions/31828218/replace-values-in-character-vector-using-lookup-table-in-r

# Replace native emoji characters with their space-padded text tokens.
emoji_to_text <- function(x) {
  Reduce(
    function(acc, i) gsub(lookup$Native[i], lookup$Desc_Sub[i], acc, fixed = TRUE),
    seq_len(nrow(lookup)),
    x
  )
}

# Replace space-free text tokens with their native emoji characters.
text_to_emoji <- function(x) {
  Reduce(
    function(acc, i) gsub(lookup$Desc_Sub2[i], lookup$Native[i], acc, fixed = TRUE),
    seq_len(nrow(lookup)),
    x
  )
}

#-----------------------------------------------------------------------------
# function counts number of test account's followers in each of 
# training accounts' follower groups
#-----------------------------------------------------------------------------
# Compute, for one test account, the fraction of its followers that also
# follow each reference (training) account.
#
# a1: follower IDs of the test account (vector or list; unlisted internally).
# test_df: data frame with columns `account` (reference handles) and
#   `followers` (list-column of follower IDs); defaults to the global
#   `training_data_full` loaded below.
#
# Returns: a 1-row data frame with one column per reference account holding
# the overlap fraction. An account with zero followers yields 0 everywhere
# (the previous implementation produced NaN from 0/0).
match_followers <- function(a1, test_df=training_data_full){
  
  a1 <- unlist(a1)
  a2_list <- test_df$followers
  
  if (length(a1) == 0) {
    # nothing to compare against -> zero overlap (avoids 0/0 = NaN)
    match_counts <- rep(0, length(a2_list))
  } else {
    match_counts <- vapply(
      a2_list,
      function(x) sum(unlist(x) %in% a1) / length(a1),
      numeric(1)
    )
  }
  names(match_counts) <- test_df$account
  
  return(data.frame(as.list(match_counts)))
}

Load training data for homophily analysis

#-----------------------------------------------------------------------------
# read reference data from scientist/WN classifications
#-----------------------------------------------------------------------------

# basic data frame containing curated usernames of WN
training_data_fh <- paste0(datadir, "/training_data/training_data_handles.rds")
training_data <- readRDS(training_data_fh)

# extended data frame containing user metadata & follower lists;
# built once and cached to disk because scraping follower lists is slow
training_data_full_fh <- paste0(datadir, "/training_data/training_data_full2.rds")

if(file.exists(training_data_full_fh)){
  # cached copy exists--load it instead of re-scraping
  training_data_full <- readRDS(training_data_full_fh)
} else {
  # pull Twitter profile metadata for each curated handle and re-attach the
  # classification columns from training_data (joined on screen name)
  training_user_data <- lookup_users(training_data$account) %>%
    mutate(account=screen_name) %>%
    left_join(training_data, "account")
  
  # each Twitter API call returns up to 5,000 follower IDs, so this is the
  # number of API calls needed per account
  fc_mod <- ceiling(training_user_data$followers_count/5000)
  
  # if more than 15 calls will be needed in total, pause 60s between calls
  # to stay under the rate limit
  sleep <- ifelse(sum(fc_mod)>15, 60, 0)
  
  # get followers for accounts in training data (tweetscores::getFollowers);
  # rowwise so each account's follower vector is stored as a list-column cell
  training_data_full <- training_user_data %>% #head
    rowwise %>% 
    mutate(followers=getFollowers(screen_name=account, oauth=my_oauth, sleep=sleep) %>% 
             data.frame %>% 
             as.list) # tweetscores
  # old version using rtweet API calls--requires fixed n
  # mutate(followers=get_followers(account, n=20000, token=token, retryonratelimit = TRUE) %>% as.list)
  
  saveRDS(training_data_full, training_data_full_fh)
}

Get Altmetric follower lists

The code below will scrape data from Altmetric and Twitter to generate a tibble where each row contains the twitter handle of an account that has tweeted or RTed the specified article, and a vector of all accounts that follow that user.

Because this can be very time-consuming, a .rds file containing this tibble will be stored to disk so analyses can be repeated without needing to re-scrape the data.

#-----------------------------------------------------------------------------
# get data from Altmetric
#-----------------------------------------------------------------------------

# Parse the Altmetric summary page and pull the article DOI from the details
# table (row 3: first cell is the label, second the value).
summary_page <- read_html(article_full_url)
doi <- summary_page %>% 
  html_nodes("div.document-details-table tr:nth-child(3) :nth-child(1)") %>% 
  html_text()

article_doi <- doi[2]

# Scrape the abstract. bioRxiv preprints need their own content page;
# otherwise the abstract is read off the Altmetric summary page.
if(grepl("biorxiv", article_full_url)){
  # BUGFIX: this assignment was commented out, so biorxiv_page was
  # undefined whenever this branch was taken
  biorxiv_page <- read_html(paste0("https://www.biorxiv.org/content/", article_doi, "v1"))
  abstract <- biorxiv_page %>% html_nodes("div.abstract #p-2") %>% html_text()  
} else {
  abstract1 <- summary_page %>% html_nodes("div.content-wrapper tr:nth-child(6) :nth-child(1)") %>% html_text()
  abstract <- gsub("\n", "", abstract1[2])
}

# altmetric metadata from the rAltmetric API
article_am <- altmetrics(doi = article_doi)
article_df <- altmetric_data(article_am)

# id <- article_df$altmetric_id
# url <- paste0(article_base_url, id, "/twitter/page:1")

# Landing page listing all tweets about the article.
twitter_url <- paste0(article_full_url, "/twitter")
twitter_page <- read_html(twitter_url)

# The first <strong> inside div.text holds the total tweet count;
# Altmetric paginates at 100 tweets per page.
totals <- html_text(html_nodes(twitter_page, "div.text strong"))
npages <- ceiling(as.integer(totals[1]) / 100)

# loop through pages of tweets on altmetric to get handles of tweeting users

# if(!exists("user_data")){
# Scrape every page of the article's Altmetric Twitter tab, collecting one
# row per tweet (handle, timestamp, text). Fixes from the original:
# - the loop variable was named `page` and then overwritten by the parsed
#   HTML document inside the body (confusing shadowing);
# - seq_len() avoids the 1:0 descending-sequence bug when npages is 0;
# - pages are collected in a preallocated list and bound once instead of
#   growing ts_df/handles inside the loop.
page_dfs <- vector("list", npages)
for(page_num in seq_len(npages)){
  # url <- paste0(article_base_url, id, "/twitter/page:", page_num)
  page_url <- paste0(twitter_url, "/page:", page_num)
  page_html <- read_html(page_url)
  
  # handles of tweeting users ("@" stripped), tweet text, and timestamps
  names <- gsub("@", "", html_nodes(page_html, "div.handle") %>% html_text())
  tweets <- gsub("\n", "", html_nodes(page_html, "div.content") %>% html_text())
  timestamps <- html_nodes(page_html, "time") %>% html_attrs() %>% unlist()
  
  page_dfs[[page_num]] <- data.frame(names, timestamps, tweets)
}
ts_df <- bind_rows(page_dfs)

# unique handles across all pages, in order of first appearance
# ("@" was already stripped above)
handles <- unique(ts_df$names)

# Original (non-RT) tweets: strip quote characters, prefix each tweet with
# its author handle, truncate to 100 chars, then count occurrences of each
# distinct tweet text.
original <- ts_df %>%
  mutate(tweets = gsub("\"", "", tweets)) %>%
  dplyr::filter(!grepl("RT @", tweets)) %>%
  mutate(
    tweets = paste0("@", names, ": ", tweets),
    tweets = substr(tweets, 1, 100)
  ) %>%
  dplyr::select(tweets) %>%
  dplyr::group_by(tweets) %>%
  count()

# Retweets: strip quote characters, drop the leading "RT " marker, truncate,
# and count how many times each tweet was retweeted (most-retweeted first).
rts <- ts_df %>%
  mutate(tweets = gsub("\"", "", tweets)) %>%
  dplyr::filter(grepl("RT @", tweets)) %>%
  mutate(tweets = gsub("RT ", "", tweets)) %>%
  # alternative: strip the full "RT @user: " prefix
  # mutate(tweets=gsub("RT @[^:]*: ", "", tweets)) %>%
  mutate(tweets = substr(tweets, 1, 100)) %>%
  group_by(tweets) %>%
  count() %>%
  arrange(desc(n))

# get user metadata from Twitter API
user_data <- lookup_users(handles)
# }

#-----------------------------------------------------------------------------
# Get follower lists for users
# due to Twitter API limits, this will take at least N minutes, 
# where N is the number of unique users that have tweeted about an article
#
# users with >5,000 followers will require multiple API calls to scrape their
# full follower lists, so a user with 75,000 followers will take 
# the same amount of time to process as 15 users with <5,000 followers each 
# (~15 minutes)
#-----------------------------------------------------------------------------

# altmetric_data_full_fh <- paste0(datadir, "altmetric_data_full.rds")
altmetric_data_full_fh <- paste0(datadir, "/article_data/altmetric_data_full_", article_id, ".rds")

if(file.exists(altmetric_data_full_fh)){
  # cached follower lists exist--load instead of re-scraping
  altmetric_data_full <- readRDS(altmetric_data_full_fh)
} else {

  # get follower metadata from Twitter API
  # sleep interval--if more than 15 API calls will be required, use one call
  # per minute to minimize weird timeout issues
  fc_mod <- ceiling(user_data$followers_count/5000)
  sleep <- ifelse(sum(fc_mod)>15, 60, 0)
  reset <- TRUE
  
  # scalar condition -> use `||` (the original's `|` worked only because
  # both operands happen to be length 1)
  if(!exists("altmetric_data_full") || reset){
    altmetric_data_full <- tibble(account=character(), followers=list())
  }

  # resume from the first unprocessed user; the explicit guard avoids the
  # (n+1):n descending-sequence bug when no users remain
  start_row <- nrow(altmetric_data_full) + 1
  if (start_row <= nrow(user_data)) {
    for(user in start_row:nrow(user_data)){
      
      # refresh OAuth handshake
      # if(user %% 20 == 0){
      #   twitteR::setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
      # }
      
      if(user_data[user,]$protected){
        # protected accounts don't expose follower lists
        cat(paste0("User ", user_data[user,]$screen_name, " is protected--skipping\n"))
      } else {
        cat(paste0("Getting follower list for: ", user_data[user,]$screen_name, 
         " (", user, " of ", nrow(user_data), ")...\n"))
      
        # one-row tibble: account handle + follower list as a list-column
        altmetric_data_user <- user_data[user,] %>%
          dplyr::select(account=screen_name) %>%
          mutate(followers = getFollowers(screen_name=account, oauth=my_oauth, sleep=sleep) %>% 
                   data.frame %>% 
                   as.list)
          
        altmetric_data_full <- bind_rows(list(altmetric_data_full, altmetric_data_user))
      }
    }
  }
  
  saveRDS(altmetric_data_full, altmetric_data_full_fh)
}

Prep data for topic modeling

This code scrapes the bios and account names of the followers for each account (up to 10,000), compiling the bios into a single “document” for each account that has tweeted/RTed a reference to the article. Then the frequencies of each term per document are calculated and used to generate a document term matrix, which serves as the input for the LDA model in the next section.

Terms include emoji and hashtags, but exclude common terms in the stop_words dataset as well as words in a custom stop word list that are frequently seen in bios and are uninformative.

#-----------------------------------------------------------------------------
# topic modeling for bios
#-----------------------------------------------------------------------------
# altmetric_follower_fh <- "/mnt/norbert/home/jedidiah/projects/hbd_meme_network/follower_bios2.rds"
altmetric_follower_fh <- paste0(datadir, "/article_data/follower_bios_", article_id, ".rds")

# set to TRUE to discard any partial in-memory results and start over
reset <- TRUE

if(file.exists(altmetric_follower_fh)){
  # cached bios exist--load instead of re-scraping
  out_df <- readRDS(altmetric_follower_fh)
} else {
  
  # scalar condition -> use `||` (the original's `|` worked only because
  # both operands happen to be length 1)
  if(!exists("out_df") || reset){
    out_df <- data.frame(account=character(), bios=character())
  }

  # resume after the last account already processed; the explicit guard
  # avoids the (n+1):n descending-sequence bug when no accounts remain
  start_row <- length(unique(out_df$account)) + 1
  if (start_row <= nrow(altmetric_data_full)) {
    for(i in start_row:nrow(altmetric_data_full)){

      cat(paste0("Getting follower bios for: ", altmetric_data_full[i,]$account, 
             " (", i, " of ", nrow(altmetric_data_full), ")...\n"))
          
      # skip accounts whose scraped follower list is empty
      if(length(unlist(altmetric_data_full[i,]$followers)) > 0){
        bios <- try(get_follower_bios(followers=altmetric_data_full[i,]$followers))
    
        if(!inherits(bios, "try-error")){
          acct_follower_bios <- data.frame(account=altmetric_data_full[i,]$account, bios)
      
          out_df <- bind_rows(list(out_df, acct_follower_bios))  
        } else {
          cat(paste0("Encountered error for user--results will not be included\n"))
        }
      
        # throttle to stay under Twitter API rate limits
        Sys.sleep(10)
      }

    }
  }
  
  saveRDS(out_df, altmetric_follower_fh)
}

# Build one "document" per account: concatenate each follower's display name
# and bio, textualize emoji and hashtags, then collapse all follower bios
# for an account into a single string.
bios <- out_df %>%
  dplyr::filter(account_lang=="en") %>%
  dplyr::select(account, bio=description, name) %>%
  # mutate(bio=ifelse(nchar(bio)!=0, bio, "missingbio")) %>%
  # unite(description, c(name, bio), sep=" ") %>%
  mutate(
    description = paste0(name, " ", bio),
    description = emoji_to_text(description),        # emoji -> " emojifoo " tokens
    description = gsub("-", "", description),
    description = gsub("#", "hashtag", description)  # keep hashtags as words
  ) %>%
  group_by(account) %>%
  summarise(doc=paste(description, collapse = " ")) %>%
  mutate(doc=iconv(doc, 'utf-8', 'ascii', sub=''))   # strip non-ASCII leftovers

# tokenize, drop multilingual stop words, URLs, and a custom stopword list

# Hand-curated terms frequent in Twitter bios but carrying no topical signal
# (URL fragments, bio boilerplate, common first names).
manual_stopwords <- c("https", "http", "t.co", "gmail.com", "views", "love", "lover", "tweets",
                      "rts", "follow", "twitter", "endorsement", "fan", "james", "michael",
                      "andrew", "ryan", "chris", "matt", "och", "rt", "opinions", "paul",
                      "endorsements", "account", "life", "john", "david", "social", "retweets")

# Stock stopword lists for the languages most common among followers.
stopword_langs <- c("en", "danish", "dutch", "finnish", "french", "german",
                    "hungarian", "italian", "norwegian", "portuguese",
                    "russian", "spanish", "swedish")

custom_stopwords <- c(manual_stopwords,
                      unlist(lapply(stopword_langs, function(l) stopwords(kind=l))))

bios_tokenized <- bios %>%
  unnest_tokens(word, doc) %>%
  dplyr::filter(!(word %in% custom_stopwords))

# apply the default English stopword list and count term frequencies per
# account, keeping only terms appearing at least 10 times in a document
word_counts <- bios_tokenized %>%
  anti_join(stop_words) %>%
  count(account, word, sort = TRUE) %>%
  dplyr::filter(n>=10)

#-----------------------------------------------------------------------------
# TESTING use 2-word tokens?
#-----------------------------------------------------------------------------
# bios_tokenized2 <- bios %>%
#   unnest_tokens(ngram, doc, token = "ngrams", n = 2) %>%
#   dplyr::filter(!(grepl(paste(custom_stopwords, collapse="|"), ngram)))
# 
# ngram_counts <- bios_tokenized2 %>%
#   count(account, ngram, sort = TRUE) %>%
#   dplyr::filter(n>=5) %>%
#   separate(ngram, c("word", "word2"), sep = " ") %>%
#   anti_join(stop_words) %>%
#   dplyr::rename(word1=word, word=word2) %>%
#   anti_join(stop_words) %>%
#   unite(word, c("word1", "word"), sep=" ")

#-----------------------------------------------------------------------------
# build the document-term matrix over the entire dataset
#-----------------------------------------------------------------------------
# One row per account ("document"), one column per term, cell = term count.
bios_dtm <- cast_dtm(word_counts, account, word, n)

# bigram / combined variants, kept for reference:
# bios_dtm <- bind_rows(word_counts, ngram_counts) %>%
#   cast_dtm(account, word, n)
# bios_dtm <- ngram_counts %>%
#   cast_dtm(account, word, n)

Estimate Number of Topics

Using the ldatuning package, we calculate several goodness-of-fit metrics across a range of candidate topic numbers to guide the choice of K for the LDA model.

# Evaluate candidate topic counts (K = 2..20) with four goodness-of-fit
# metrics from the ldatuning package, using the same VEM estimator and seed
# as the final LDA fit below.
result <- FindTopicsNumber(
  bios_dtm,
  topics = seq(2, 20),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  control = list(seed = 5678),
  method = "VEM",
  # method = "Gibbs",
  mc.cores = 8L,
  verbose = TRUE
)

# Deveaud2014 can be Inf for some K; filter before plotting if needed:
# FindTopicsNumber_plot(result[result$Deveaud2014!=Inf,])
FindTopicsNumber_plot(result)

Run LDA

This code runs the LDA model to represent the corpus of documents as a mixture of K “topics.” Each topic is represented by a different set of words/terms that frequently co-occur. The gamma values are interpreted as the fraction of each document corresponding to each of the K topics.

# Fit the LDA topic model: K = 12 topics, sparse topic prior (alpha = 0.1),
# fixed seed for reproducibility. (The name bios_lda6 is historical--earlier
# runs used K = 6.)
bios_lda6 <- LDA(bios_dtm, k = 12, control = list(alpha=0.1, seed = 5678), method="VEM")
# bios_lda6 <- LDA(bios_dtm, k = 6, control = list(seed = 5678), method="Gibbs")
# bios_lda6 <- LDA(bios_dtm, k = 6, control = list(seed = 1234))

# Per-topic term probabilities (beta): one row per topic/term pair.
bios_lda_td <- tidy(bios_lda6)

# Top 30 terms per topic, with emoji tokens and hashtags converted back to
# their display form.
top_terms <- bios_lda_td %>%
  group_by(topic) %>%
  top_n(30, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(
    term = text_to_emoji(term),
    term = gsub("hashtag", "#", term)
  )

# One label row per topic: comma-separated top terms.
# NOTE: the column is named top_10 for historical reasons but holds the top
# 30 terms; downstream joins depend on this name, so it is kept.
topics_terms <- top_terms %>%
  dplyr::select(-beta) %>%
  # mutate(topic=paste0("t", topic)) %>%
  group_by(topic) %>%
  summarise(top_10=paste(term, collapse=", ")) %>%
  ungroup()

# Per-document topic proportions (gamma). The tiny uniform jitter breaks
# ties so top_n() below selects a single dominant topic per document.
# NOTE(review): the jitter is unseeded, so document ordering may vary
# slightly between runs--confirm whether this matters downstream.
bios_lda_gamma <- tidy(bios_lda6, matrix = "gamma") %>%
  rowwise() %>%
  mutate(gamma=gamma+runif(1,0,0.0001))

# Dominant topic per document; provides the account ordering used by the
# stacked-bar plots below.
docs_order <- bios_lda_gamma %>%
  group_by(document) %>%
  arrange(topic, -gamma) %>%
  top_n(1, gamma) %>%
  rename(topic_group = topic) %>%
  dplyr::select(-gamma)

# earlier spread/gather implementation of the same ordering, kept for reference
# docs_order <- bios_lda_gamma %>% 
#   spread(topic, gamma) %>%
#   # dplyr::arrange(-`1`, -`2`, -`3`, -`4`, -`5`, -`6`) %>%
#   dplyr::arrange(-`1`, -`2`, -`3`, -`4`, -`5`) %>%
#   # gather(topic, gamma, `1`:`6`) %>% #head
#   gather(topic, gamma, `1`:`5`) %>% #head
#   group_by(document) %>%
#   # arrange(topic, -gamma) %>%
#   top_n(1, gamma) %>%
#   rename(topic_group = topic) %>%
#   dplyr::select(-gamma)

# optional emoji font for base plots, kept for reference:
# devtools::install_github("GuangchuangYu/emojifont")
# library(emojifont)
# load.emojifont('OpenSansEmoji.ttf')

Plot topic breakdown by user

This code generates a plot showing the topic breakdown of each account's follower base.

# color palette: 18 distinct colors (plots below use the first 12, one per topic)
cols <- c(brewer.pal(9, "Set1"), brewer.pal(9, "Set3"))

# Stacked-bar chart: one bar per account (ordered by dominant topic),
# segments showing each LDA topic's gamma (topic proportion), legend entries
# labeled with each topic's top terms.
p <- bios_lda_gamma %>%
    # left_join(altmetric_data_full %>% 
    #           rowwise() %>% 
    #           mutate(nfol=length(followers)) %>% 
    #           ungroup() %>%
    #           mutate(pct_fol=nfol/sum(nfol)) %>%
    #           dplyr::select(document=account, pct_fol), by="document") %>%
  mutate(document=factor(document, levels=docs_order$document)) %>%
  left_join(topics_terms, by="topic") %>%
  mutate(topic=paste0(topic, ": ", top_10)) %>%
  mutate(topic=factor(topic, levels=unique(topic))) %>%
  ggplot(aes(x=document, y=gamma, fill=topic))+
  # geom_bar(aes(width=pct_fol), stat="identity", position="stack")+
  geom_bar(stat="identity", position="stack")+
  scale_fill_manual(values=cols)+
  scale_y_continuous(expand = c(0,0))+
  scale_x_discrete(position = "top")+ 
  xlab("Account")+
  theme(legend.position="bottom",
        axis.title.y=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())+
  guides(fill=guide_legend(ncol=1))

# interactive version with the legend stacked below the plot
p2 <- ggplotly(p) %>%
  layout(legend = list(orientation = "v",   # stack legend entries vertically
                     xanchor = "center",  # use center of legend as anchor
                     yanchor = "bottom",
                     x = 0, y=-1))

p2
htmlwidgets::saveWidget(p2, 
                        file=paste0(datadir, "/figs/topic_breakdown_by_user_", article_id, ".html"),
                        title=paste0("topic_breakdown_by_user_", article_id))

# static export, kept for reference:
# ggsave(paste0(datadir, "/topic_breakdown_by_user_", article_id, ".png"), plot=p, width = 18, height=9)

Plot topic breakdown by user, adjusted for # followers

# Variant of the topic-breakdown plot where each account's footprint is
# scaled by its follower count: left/right are cumulative follower-count
# positions, faceted one panel per topic.
# NOTE(review): the geom_rect layer also inherits x=document from the bar
# aes; this layer looks experimental--verify the rectangles render as
# intended before relying on this figure.
p_weight <- bios_lda_gamma %>%
    left_join(altmetric_data_full %>%
              rowwise() %>%
              mutate(nfol=length(followers)) %>% 
              ungroup() %>%
              mutate(right=cumsum(nfol), left=right-nfol) %>%
              dplyr::select(document=account, left, right), by="document") %>%
  mutate(document=factor(document, levels=docs_order$document)) %>%
  left_join(topics_terms, by="topic") %>%
  mutate(topic=paste0(topic, ": ", top_10)) %>%
  mutate(topic=factor(topic, levels=unique(topic))) %>%
  ggplot(aes(x=document, y=gamma, fill=topic, group=document))+
  # geom_bar(aes(width=pct_fol), stat="identity", position="stack")+
  geom_bar(stat="identity", position="stack")+
  geom_rect(aes(xmin = left, xmax = right, ymin=0, ymax=gamma)) +
  scale_fill_manual(values=cols)+
  scale_y_continuous(expand = c(0,0))+
  facet_wrap(~topic, scales="free", ncol=1)+
  xlab("Account")+
  # theme_classic()+
  theme(legend.position="none",
        axis.title.y=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())+
  guides(fill=guide_legend(ncol=1))

p_weight

ggsave(paste0(datadir, "/figs/topic_breakdown_by_user_weight_", article_id, ".png"), 
       plot=p_weight, width = 12, height=12)

PCA + UMAP

Apply PCA followed by UMAP to obtain a 2D embedding of accounts according to their homophily with the reference accounts.

#-----------------------------------------------------------------------------
# Generate follower overlap matrix: 
# 1 row per test account, i, 1 column per reference account, j
# each cell contains fraction of i's followers that also follow j
#-----------------------------------------------------------------------------
# BUGFIX: the original filter used length(followers), which evaluates to a
# single scalar (the number of rows in the list-column), so the condition
# recycled and either kept or dropped ALL accounts. The per-row follower
# count requires unlisting each cell (same idiom as the scraping loop above).
sim_matrix <- altmetric_data_full %>%
  dplyr::filter(vapply(followers, function(f) length(unlist(f)), integer(1)) >= 10) %>%
  group_by(account) %>%
  nest() %>%
  mutate(test=map(data, ~match_followers(.$followers))) %>% unnest(test)

saveRDS(sim_matrix, paste0(datadir, "/article_data/sim_matrix_", article_id, ".rds"))

# apply PCA to the follower-overlap matrix
# NOTE(review): columns 2:41 and .fittedPC1:.fittedPC40 assume exactly 40
# reference accounts in sim_matrix--confirm against the training data before
# reusing this with a different reference set.
sim_matrix_pca <- sim_matrix %>%
  dplyr::select(-data) %>%
  mutate(sum = rowSums(.[2:41])) %>% 
  dplyr::filter(sum!=0) %>%
  dplyr::select(-sum) %>%
  nest() %>%
  mutate(pca = purrr::map(data, ~prcomp(.x %>% dplyr::select(-account), center = TRUE, scale = TRUE)), 
         pca_tidy = purrr::map2(pca, data, ~broom::augment(.x, data = .y))) #%>% unnest() %>% head

# apply UMAP to the PC scores (fixed random_state for a reproducible layout)
sim_matrix_umap <- sim_matrix_pca[[3]][[1]] %>% 
  dplyr::select(.fittedPC1:.fittedPC40) %>%
  data.frame() %>%
  umap(n_neighbors=30, random_state=36262643)

Plot UMAP embedding

# Join UMAP coordinates (X1/X2) with Twitter user metadata and compute
# homophily summaries: mean follower overlap with the WN reference accounts
# (columns vdare through NewRightAmerica) vs. the scientist reference
# accounts (pastramimachine through girlscientist), plus their log10 ratio.
umap_plotdat <- bind_cols(sim_matrix_pca[[3]][[1]], data.frame(sim_matrix_umap$layout)) %>%
  left_join(user_data %>% dplyr::rename(account=screen_name),
            by="account") %>%
  mutate(wn_mean=rowMeans(dplyr::select(.,vdare:NewRightAmerica), na.rm = TRUE),
         sc_mean=rowMeans(dplyr::select(.,pastramimachine:girlscientist), na.rm = TRUE)) %>%
  mutate(affiliation=log10(wn_mean/(sc_mean+0.001))) %>%
  dplyr::filter(sc_mean != 0 & wn_mean != 0) %>%
  mutate(urls=paste0("https://twitter.com/", account))

# density-based clustering of the 2D embedding; assignments are stored on
# umap_plotdat (presumably for later exploration--not used in the plots below)
hdb_clust <- umap_plotdat %>%
  dplyr::select(X1:X2) %>%
  as.matrix() %>%
  hdbscan(x=., minPts=10)

umap_plotdat$cluster <- as.character(hdb_clust$cluster)

# Build an interactive UMAP scatter: one point per account, colored by the
# WN:scientist follower-ratio, sized by log follower count, with clickable
# points that open the account's Twitter profile.
#
# plotdat: data frame with columns X1/X2 (UMAP coords), account,
#   affiliation, followers_count, and urls (profile links).
# Returns: an htmlwidget (plotly object with a JS click handler attached).
plot_embedding <- function(plotdat){
  
  p <- plotdat %>%
    # replace_na(list(hbd=FALSE)) %>%
    ggplot(aes(x=X1, y=X2, label=account, colour=affiliation))+
    geom_point(aes(size=log(followers_count)), alpha=0.8)+
    scale_colour_gradientn("WN:Scientist Follower Ratio", 
                           colors=rev(brewer.pal(9, "RdBu")), 
                           breaks=seq(-3,3),
                           labels=c("1:1000", "1:100", "1:10","1:1","10:1","100:1","1000:1"),
                           limits=c(-3,3))+
    theme_dark()+
    theme(legend.position="bottom")
  
  ply <- ggplotly(p)
  
  # Clickable points link to profile URL using onRender: https://stackoverflow.com/questions/51681079
  ply$x$data[[1]]$customdata <- plotdat$urls
  #pp  <- add_markers(pp, customdata = ~url)
  plyout <- onRender(ply, "
                     function(el, x) {
                     el.on('plotly_click', function(d) {
                     var url = d.points[0].customdata;
                     //url
                     window.open(url);
                     });
                     }
                     ")

  plyout
}
# Build the widget once, then display and save the same object (the original
# called plot_embedding() twice, rebuilding an identical widget).
embedding_widget <- plot_embedding(umap_plotdat)
embedding_widget
htmlwidgets::saveWidget(embedding_widget, 
                        file=paste0(datadir, "/figs/homophily_ratio_", article_id, ".html"),
                        title=paste0("homophily_ratio_", article_id))

Plot UMAP embeddings colored by LDA topic

# Dominant topic per account (same tie-broken gammas as above), labeled with
# each topic's top terms; used to color the embedding plots below.
lda_gammas <- bios_lda_gamma %>%
  mutate(document=factor(document, levels=docs_order$document)) %>%
  left_join(topics_terms, by="topic") %>%
  mutate(topic=paste0(topic, ": ", top_10)) %>%
  mutate(topic=factor(topic, levels=unique(topic))) %>%
  group_by(document) %>%
  arrange(topic, -gamma) %>%
  top_n(1, gamma) %>%
  rename(account=document)

# UMAP embedding colored by each account's dominant LDA topic.
p2 <- umap_plotdat %>% 
  left_join(lda_gammas, by="account") %>%
  ggplot(aes(x=X1, y=X2, label=account, colour=topic))+
  geom_point(size=2)+
  scale_colour_manual(values=cols)+
  theme_dark()+
  theme(legend.position="none")#+
  # guides(colour=guide_legend(ncol=1))

# Convert to an interactive widget once, then display and save the same
# object (the original called ggplotly(p2) twice).
p2_widget <- ggplotly(p2)
p2_widget

htmlwidgets::saveWidget(p2_widget, 
                        file=paste0(datadir, "/figs/homophily_lda_", article_id, ".html"),
                        title=paste0("homophily_lda_", article_id))

Plot PCA of LDA gammas

# LDA gammas in wide form (documents x topics) as a plain matrix.
# NOTE(review): bios_lda_pca_in / bios_lda_pca1 are not referenced again in
# this file--possibly superseded by the nested pipeline just below; confirm
# before removing.
bios_lda_pca_in <- bios_lda_gamma %>% 
  spread(topic, gamma) %>% 
  dplyr::select(-document) %>% as.matrix

bios_lda_pca1 <- prcomp(bios_lda_pca_in, center=T, scale=T)

# PCA of the document-topic proportions, nested tidyverse-style so the
# fitted PCs stay attached to document ids via broom::augment.
bios_lda_pca <- bios_lda_gamma %>% 
  spread(topic, gamma) %>% 
  nest() %>%
  mutate(pca = purrr::map(data, ~prcomp(.x %>% dplyr::select(-document), center = TRUE, scale = TRUE)), 
         pca_tidy = purrr::map2(pca, data, ~broom::augment(.x, data = .y)))

# Scatter of accounts in LDA-gamma PCA space, colored by dominant topic.
p3 <- bios_lda_pca[[3]][[1]] %>%
  rename(account=document) %>%
  left_join(lda_gammas, by="account") %>% #head
  # inner_join(sim_matrix, by="account") %>%
  left_join(user_data %>% dplyr::rename(account=screen_name),
            by="account") %>%
  # mutate(wn_mean=rowMeans(dplyr::select(.,vdare:NewRightAmerica), na.rm = TRUE),
  #        sc_mean=rowMeans(dplyr::select(.,pastramimachine:girlscientist), na.rm = TRUE)) %>% #head
  # mutate(affiliation=log10(wn_mean/(sc_mean+0.001))) %>%
  # mutate(affiliation_binary=ifelse(affiliation<log10(0.5), 
  #                                  "Science", 
  #                                  ifelse(affiliation>log10(2), 
  #                                         "White Nationalist", 
  #                                         "Neutral"))) %>%
  ggplot(aes(x=.fittedPC1, y=.fittedPC2, colour=topic, label=account))+
  geom_point(size=2, alpha=0.7)+
  scale_colour_manual(values=cols)+
  # scale_shape_manual(values=c(3, 1, 7))+
  # theme_dark()+
  theme(legend.position="bottom")+
  guides(colour=guide_legend(ncol=1))

# interactive version with the legend stacked below the plot
p3_ply <- ggplotly(p3) %>%
  layout(legend = list(orientation = "v",   # stack legend entries vertically
                     xanchor = "center",  # use center of legend as anchor
                     yanchor = "bottom",
                     x = 0, y=-1))

p3_ply

htmlwidgets::saveWidget(p3_ply, 
                        file=paste0(datadir, "/figs/lda_pca_", article_id, ".html"),
                        title=paste0("lda_pca_", article_id))

Missing data analysis

# Per-account missing-bio summary: fraction of an account's followers whose
# profile bio is empty, joined with dominant-topic and homophily data.
bios_m <- out_df %>%
  # dplyr::filter(account_lang=="en") %>%
  dplyr::select(account, bio=description, name) %>%
  mutate(bio=ifelse(nchar(bio)!=0, 0, 1)) %>%  # recode: 1 = empty bio, 0 = has bio
  # unite(description, c(name, bio), sep=" ") %>%
  group_by(account) %>%
  summarise(tot=n(), nmissing=sum(bio), pct=nmissing/tot) %>% #dim
  # dplyr::rename("document"="account") %>%
  inner_join(lda_gammas, by="account") %>%
  inner_join(umap_plotdat, by="account")


# Distribution of missing-bio fractions by dominant topic (jitter + boxplot).
p4 <- bios_m %>%
  mutate(topic_num=gsub(":.*", "", topic)) %>%
  ggplot(aes(x=topic_num, y=pct, colour=topic, label=account))+
  geom_jitter(size=3, alpha=0.6)+
  geom_boxplot(outlier.shape=NA)+
  scale_colour_manual(values=cols)+
  theme(legend.position="bottom",
        axis.title.y=element_blank(),
        axis.text.x=element_blank())+
  guides(colour=guide_legend(ncol=1))

# interactive version with the legend stacked below the plot
p4_ply <- ggplotly(p4) %>%
  layout(legend = list(orientation = "v",   # stack legend entries vertically
                     xanchor = "center",  # use center of legend as anchor
                     yanchor = "bottom",
                     x = 0, y=-1))

htmlwidgets::saveWidget(p4_ply, 
                        file=paste0(datadir, "/figs/missing_dist_", article_id, ".html"),
                        title=paste0("missing_dist_", article_id))

p4_ply

Correlation between missing data and WN homophily

# Scatter of missing-bio fraction vs. WN homophily with a per-topic linear
# fit; accounts with >=50% missing bios are excluded as outliers.
p4a <- bios_m %>%
  mutate(topic_num=gsub(":.*", "", topic)) %>%
  dplyr::filter(pct<0.5) %>%
  ggplot(aes(x=pct, y=wn_mean, group=topic_num, colour=topic, label=account))+
  geom_point()+
  geom_smooth(method="lm", se=F)+
  scale_colour_manual(values=cols)+
  # facet_wrap(~topic_num, scales="free")+
  xlab("Fraction of followers with missing bios")+
  ylab("WN Homophily")+
  theme(legend.position="bottom")+
  guides(colour=guide_legend(ncol=1))

# interactive version with the legend stacked below the plot
p4a_ply <- ggplotly(p4a) %>%
  layout(legend = list(orientation = "v",   # stack legend entries vertically
                     xanchor = "center",  # use center of legend as anchor
                     yanchor = "bottom",
                     x = 0, y=-1))

htmlwidgets::saveWidget(p4a_ply, 
                        file=paste0(datadir, "/figs/missing_homophily_cor_", article_id, ".html"),
                        title=paste0("missing_homophily_cor_", article_id))

p4a_ply

Hierarchical clustering by cosine similarity

m <- as.matrix(bios_dtm)
# distMatrix <- dist(m, method="euclidean")

# Cosine similarity between account term vectors: normalize each row to unit
# length, then the cross-product gives pairwise cosine similarity;
# distance = 1 - similarity.
sim <- m / sqrt(rowSums(m * m))
sim <- sim %*% t(sim)
distMatrix <- as.dist(1 - sim)

# Ward hierarchical clustering on the cosine distance matrix.
groups <- hclust(distMatrix,method="ward.D")
dend <- as.dendrogram(groups)
dend_data <- dendro_data(dend, type = "rectangle")
label_order <- dend_data$labels %>%
  dplyr::rename("account"="label")

# map LDA topic numbers to plot colors
# coldf <- data.frame(topic=c(1,10:12,2:9), col=cols[c(1,10:12,2:9)])
coldf <- data.frame(topic=c(1,10:12,2:9), col=cols[1:12])

# leaf colors in dendrogram (left-to-right) order
lgo <- lda_gammas %>% 
  mutate(topic=as.numeric(gsub(":.*", "", topic))) %>% 
  left_join(coldf, by="topic") %>% 
  ungroup() %>% 
  left_join(label_order, by="account") %>% 
  arrange(x)

# ggdendrogram(groups)

# Styled dendrogram: blank labels, leaf points colored by dominant topic.
dend_styled <- dend %>%
  set("labels", "") %>%
  # set("labels_col", lgo$col) %>% set("labels_cex", 1) %>%
  set("leaves_pch", 19) %>% set("leaves_cex", 1) %>% set("leaves_col", lgo$col)

# display inline
plot(dend_styled)

# BUGFIX: the original called ggsave(plot=p_weight) here, which silently
# saved the earlier weighted-topic figure instead of this dendrogram (a
# dendextend plot is base graphics and cannot be saved with ggsave at all).
# Write the dendrogram to PNG via a graphics device instead.
png(paste0(datadir, "/figs/cosine_similarity_", article_id, ".png"),
    width = 12, height = 12, units = "in", res = 300)
plot(dend_styled)
dev.off()

PCA by cosine similarity

# PCA of the cosine-distance matrix itself (rows = accounts).
dmp <- prcomp(as.matrix(distMatrix), center=TRUE, scale=TRUE)

# PC scores joined with dominant-topic labels
dmp_df <- dmp$x %>%
  as_tibble(rownames="account") %>%
  inner_join(lda_gammas, by="account")


# First two PCs colored by dominant topic.
p5 <- dmp_df %>%
  # mutate(topic_num=gsub(":.*", "", topic)) %>%
  ggplot(aes(x=PC1, y=PC2, colour=topic, label=account))+
  geom_point()+
  scale_colour_manual(values=cols)+
  theme(legend.position="none")+
  guides(colour=guide_legend(ncol=1))

# interactive version (the legend layout is moot while the legend is hidden,
# but kept for consistency with the other plots)
p5_ply <- ggplotly(p5) %>%
  layout(legend = list(orientation = "v",   # stack legend entries vertically
                     xanchor = "center",  # use center of legend as anchor
                     yanchor = "bottom",
                     x = 0, y=-1))

htmlwidgets::saveWidget(p5_ply, 
                        file=paste0(datadir, "/figs/cosine_pca_", article_id, ".html"),
                        title=paste0("cosine_pca_", article_id))

p5_ply

UMAP by cosine similarity

# UMAP on the PCA rotation of the cosine-distance matrix
# (fixed random_state for a reproducible layout).
dmp_umap <- dmp$x %>% as.data.frame() %>%
    umap(n_neighbors=20, random_state=36262643)


# Embedding coordinates joined with dominant-topic labels and the earlier
# homophily plot data (for sizing points by wn_mean).
dmp_df2 <- dmp_umap$layout %>%
  as_tibble(rownames="account") %>%
  inner_join(lda_gammas, by="account") %>%
  left_join(umap_plotdat, by="account")

# Cosine-similarity UMAP colored by dominant topic, sized by WN homophily.
p2 <- dmp_df2 %>% 
  ggplot(aes(x=V1, y=V2, label=account, colour=topic))+
  geom_point(aes(size=wn_mean), alpha=0.7)+
  scale_colour_manual(values=cols)+
  theme_dark()+
  theme(legend.position="none")#+
  # guides(colour=guide_legend(ncol=1))

# Build the widget once, save it, then display the same object (the original
# called ggplotly(p2) twice, rebuilding an identical widget).
p2_cosine_widget <- ggplotly(p2)

htmlwidgets::saveWidget(p2_cosine_widget, 
                        file=paste0(datadir, "/figs/cosine_umap_", article_id, ".html"),
                        title=paste0("cosine_umap_", article_id))

p2_cosine_widget

Scatterpie plot (experimental)

Plot the UMAP embedding with each point drawn as a pie chart showing the account's full topic breakdown, not just its top topic.

# One row per account: UMAP coordinates plus the full topic-probability
# (gamma) vector widened into one column per topic, with topic columns
# labelled "<topic>: <top 10 terms>".
sp_dat <- dmp_df2 %>% 
  dplyr::select(document=account, top_topic=topic, V1, V2) %>% 
  inner_join(bios_lda_gamma, by="document") %>% 
  left_join(topics_terms, by="topic") %>%
  mutate(topic=paste0(topic, ": ", top_10)) %>%  # label topics with top terms
  dplyr::select(-top_10) %>%
  spread(topic, gamma)

# Pie chart per account at its UMAP position, showing its full topic mix.
# NOTE(review): cols=names(sp_dat)[5:16] assumes exactly 12 topic columns
# in positions 5-16 after spread() — verify this positional selection if
# the number of topics changes.
ggplot()+
    geom_scatterpie(aes(x=V1, y=V2, group=document), 
                    data=sp_dat, 
                    cols=names(sp_dat)[5:16], 
                    color=NA,
                    alpha=0.7)+
    scale_fill_manual(values=cols)+
    theme_dark()+
    theme(legend.position="none")

Timeline plot

# Retweet-cascade timeline: each grey line is one tweet's retweet history;
# y = retweet rank (log scale) against timestamp, points coloured by the
# retweeting account's top topic and sized by wn_mean.
p_times <- ts_df %>% 
  rename(account=names) %>% 
  left_join(dmp_df2, by="account") %>% 
  group_by(tweets) %>% 
  arrange(timestamps) %>% 
  mutate(order=row_number(), n=n()) %>%  # order = rank within a tweet's retweets
  dplyr::filter(n>3) %>%  # drop tweets with too few retweets to draw a line
  ungroup() %>%
  ggplot(aes(x=timestamps, y=order, group=tweets, label=account))+
  geom_line(colour="grey80")+
  geom_point(aes(colour=topic, size=wn_mean), alpha=0.5)+
  scale_colour_manual(values=cols)+
  scale_y_log10()+
  # timestamps are treated as a discrete axis; label every 10th value.
  # NOTE(review): assumes ts_df$timestamps is character/factor — confirm.
  scale_x_discrete(breaks=ts_df$timestamps[seq(1, nrow(ts_df), 10)])+
  ylab("Retweet Number")+
  theme_classic()+
  theme(axis.title.x=element_blank(),
    axis.text.x=element_text(size=6, angle=45, hjust=1),
        legend.position="none")

# Save the interactive version, then display it.
htmlwidgets::saveWidget(ggplotly(p_times), 
                        file=paste0(datadir, "/figs/timeline_", article_id, ".html"),
                        title=paste0("timeline_", article_id))

ggplotly(p_times)

Title similarity (experimental)

# tweets_cleaned <- ts_df %>%
#   rename("account"="names") %>%
#   dplyr::filter(!grepl("RT @", tweets)) %>%
#   mutate(tweets=gsub("http.*", "", tweets)) %>%
#   mutate(tweets=gsub("\"", "", tweets)) %>%
#   mutate(description = emoji_to_text(tweets)) %>%
#   mutate(description = gsub("-", "", description)) %>%
#   mutate(description = gsub("#", "hashtag", description)) %>%
#   left_join(lda_gammas, by="account") %>%
#   group_by(topic) %>%
#   summarise(doc=paste(description, collapse = " ")) %>%
#   mutate(doc=iconv(doc, 'utf-8', 'ascii', sub='')) #%>% #head
  # mutate(tweets=substr(tweets, 1, 100)) %>%
  # dplyr::select(tweets) %>%
  # dplyr::group_by(tweets) %>%
  # count() %>% 
  # tidyr::separate(tweets, into=c("user", "tweet"), sep=": ", remove=FALSE) %>% #head

# Similarity between each tweet's text and the article's title/abstract,
# reshaped long (one row per tweet x {title, abstract} comparison) and
# annotated with the tweeting account's top topic.
title_sim <- ts_df %>% 
  mutate(tweets=gsub("http.*", "", tweets)) %>%   # strip URLs
  mutate(tweets=gsub("\"", "", tweets)) %>%       # strip quote marks
  mutate(description = emoji_to_text(tweets)) %>% # NOTE(review): unused downstream
  rename("account"="names") %>% 
  mutate(title_match_score=sim.strings(tweets, article_df$title)) %>% 
  mutate(abstract_match_score=sim.strings(tweets, abstract)) %>% 
  # pivot_longer() replaces the superseded gather(); same output columns
  # (group = which comparison, sim_score = the similarity value).
  pivot_longer(cols=title_match_score:abstract_match_score,
               names_to="group", values_to="sim_score") %>%
  left_join(lda_gammas, by="account")


# Ridgeline densities of similarity scores by topic, with jittered
# per-tweet points, faceted by title vs abstract comparison.
p_title <- title_sim %>% 
  # ggplot(aes(x=topic, y=sim_score, colour=topic, label=tweets))+
  ggplot(aes(x=sim_score, y=topic, fill=topic, label=tweets))+
  # geom_line(colour="grey80")+
  geom_density_ridges(aes(point_color = topic), scale = 4, alpha = .6, jittered_points = TRUE)+ 
  #theme_ridges() +
  # geom_boxplot()+
  # geom_jitter()+
  facet_wrap(~group)+
  # geom_point(aes(colour=topic, size=wn_mean), alpha=0.5)+
  scale_fill_manual(values=cols)+
  # point_color is a ggridges-specific aesthetic; use the same palette
  scale_colour_manual(values=cols, aesthetics = "point_color")+
  theme_classic()+
  theme(axis.text.y=element_blank(),
    # axis.text.x=element_text(size=6, angle=45, hjust=1),
        legend.position="none")

p_title
## Picking joint bandwidth of 0.0515
## Picking joint bandwidth of 0.108

# Static export of the ridgeline figure.
ggsave(paste0(datadir, "/figs/title_sim_ridges_", article_id, ".png"), 
       plot=p_title, width = 12, height=12)
## Picking joint bandwidth of 0.0515
## Picking joint bandwidth of 0.108
# Boxplot + jitter alternative to the ridgeline view: similarity score per
# topic, faceted by title vs abstract comparison. Topic labels are hidden
# on the x-axis; they are recoverable from colour / hover text.
p_title <- ggplot(title_sim,
                  aes(x = topic, y = sim_score, colour = topic, label = tweets)) +
  geom_boxplot() +
  geom_jitter() +
  facet_wrap(~group) +
  scale_colour_manual(values = cols) +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_blank())


# Save the interactive version, then display it.
htmlwidgets::saveWidget(
  ggplotly(p_title),
  file = paste0(datadir, "/figs/title_sim_", article_id, ".html"),
  title = paste0("title_sim_", article_id)
)

ggplotly(p_title) 
# word_counts_tweets <- tweets_cleaned %>%
#   unnest_tokens(word, doc) %>% 
#   dplyr::filter(!(word %in% custom_stopwords)) %>%
#   dplyr::filter(!(word %in% unlist(strsplit(article_df$title, " "))) %>%
#   anti_join(stop_words) %>%
#   count(topic, word, sort = TRUE) %>%
#   dplyr::filter(n>=2)
# 
# #-----------------------------------------------------------------------------
# # run over entire dataset
# #-----------------------------------------------------------------------------
# tweets_dtm <- word_counts_tweets %>%
#   cast_dtm(topic, word, n)
# 
# tweets_lda6 <- LDA(tweets_dtm, k = 5, control = list(alpha=0.1, seed = 5678), method="VEM")
# 
# tweets_lda_td <- tidy(tweets_lda6)
# 
# top_terms_tweets <- tweets_lda_td %>%
#   group_by(topic) %>%
#   top_n(10, beta) %>%
#   ungroup() %>%
#   arrange(topic, -beta) %>% 
#   mutate(term = text_to_emoji(term)) %>%
#   mutate(term = gsub("hashtag", "#", term))
# 
# 
# top_terms_tweets %>% data.frame

# inner_join(get_sentiments("nrc")) %>% #head(20)
#   count(user, sentiment) %>% 
#   spread(sentiment, nn, fill = 0) %>%
#   mutate(sentiment = positive - negative) #%>%
# 
# inner_join(get_sentiments("bing")) %>%
#   count(book, index = linenumber %/% 80, sentiment) %>%
#   spread(sentiment, n, fill = 0) %>%
#   mutate(sentiment = positive - negative)